#knitr::opts_chunk$set(include = TRUE, warning = FALSE)
# Pckgs -------------------------------------
#if (!require ("pacman")) (install.packages("pacman"))
#p_install_gh("luisDVA/annotater")
#p_install_gh("HumanitiesDataAnalysis/hathidy")
# devtools::install_github("HumanitiesDataAnalysis/HumanitiesDataAnalysis")
library(here)
library(fs)
library(paint)
library(tidyverse)
library(magrittr)
library(skimr)
library(scales)
library(colorspace)
library(httr)
library(DT) # an R interface to the JavaScript library DataTables
library(knitr)
library(kableExtra)
library(flextable)
library(splitstackshape) #Stack and Reshape Datasets After Splitting Concatenated Values
library(tm) # Text Mining Package
library(tidytext) # Text Mining using 'dplyr', 'ggplot2', and Other Tidy Tools
# this requires pre-requirsites to install : https://github.com/quanteda/quanteda
library(quanteda)
library(igraph)
library(sjmisc) # Data and Variable Transformation Functions
library(ggraph) # An Implementation of Grammar of Graphics for Graphs and Networks
library(widyr) # Widen, Process, then Re-Tidy Data
library(SnowballC) # Snowball Stemmers Based on the C 'libstemmer' UTF-8 Library
# library(#HumanitiesDataAnalysis, # Data and Code for Teaching Humanities Data Analysis
library(sentencepiece) # Text Tokenization using Byte Pair Encoding and Unigram Modelling
library(sysfonts)
library(ggdendro)
library(network)
library(GGally)
library(topicmodels) # with dep FAILED !!!!!!
# extra steo needed to install github version
#if (!require("devtools")) install.packages("devtools")
#library(devtools)
#install_github("husson/FactoMineR") FAILED !!!!!!
# library(FactoMineR)
#library(factoextra)
# Plot Theme(s) -------------------------------------
#source(here("R", "ggplot_themes.R"))
# Use a minimal ggplot2 theme for every plot in this document.
ggplot2::theme_set(theme_minimal())
# color palettes -----
# five-step teal gradient (light -> dark), for sequential scales
mycolors_gradient <- c("#ccf6fa", "#80e8f3", "#33d9eb", "#00d0e6", "#0092a1")
# five contrasting hues, for categorical fills
mycolors_contrast <- c("#E7B800", "#a19100", "#0084e6","#005ca1", "#e60066" )
# Function(s) -------------------------------------
# Data -------------------------------------
# -------------------- {cut bc made too heavy} -------------------------------------
# # Tables [AH knit setup when using kbl() ]------------------------------------
# Pretty-print data frames in knitted output: route them through
# kable()/kable_styling() instead of the default console print method.
knit_print.data.frame <- function(x, ...) {
  styled_table <- kable_styling(kable(x, booktabs = TRUE))
  # two leading blank lines keep the table separated from preceding text
  asis_output(paste(c("", "", styled_table), collapse = "\n"))
}
registerS3method("knit_print", "data.frame", knit_print.data.frame)
registerS3method("knit_print", "grouped_df", knit_print.data.frame)Process and merge data - WDR abstracts
Work in progress
World Development Reports (WRDs)
- DATA https://datacatalog.worldbank.org/search/dataset/0037800
- INSTRUCTIONS https://documents.worldbank.org/en/publication/documents-reports/api
- Following: (Kaye 2019; Robinson 2017; Robinson and Silge 2022)
I) Pre-processing
I.ii) – Set stopwords [more…]
# --- alt stop words
# mystopwords <- tibble(word = c("eq", "co", "rc", "ac", "ak", "bn",
# "fig", "file", "cg", "cb", "cm",
# "ab", "_k", "_k_", "_x"))
# --- set up stop words
# --- set up stop words
# Start from the tidytext `stop_words` lexicon, then:
#  * add corpus-specific noise tokens: the report acronym, the word
#    "report", and every WDR publication year (1978-2022);
#  * KEEP some words that are generic English stop words but are
#    meaningful in the WDR context (e.g. "young", "old", "member").
# (The previously considered additions "world"/"development" and removals
# "changes"/"value"/"zero" remain intentionally disabled.)
stop_words <- as_tibble(stop_words) %>% # in the tidytext dataset
  # vectorized add_row: one call replaces 45+ repetitive ones;
  # `lexicon` is recycled to the length of `word`
  add_row(word = c("WDR", "report", as.character(1978:2022)),
          lexicon = NA_character_) %>%
  # retain words that carry meaning in development economics
  filter(!word %in% c("member", "part", "possible", "point",
                      "present", "young", "old", "trying"))
# --- set up stop words stemmed
# Stemmed copy of the custom stop-word list, for anti-joining against
# stemmed tokens (wordStem() is vectorized over `word`).
stop_words_stem <- stop_words %>%
  mutate(word = SnowballC::wordStem(word))

II) Data (ingestion), loading & cleaning
Ingestion of WDR basic metadata was done in ./_my_stuff/WDR-data-ingestion.Rmd and the result saved as ./data/raw_data/WDR.rds <– (Being somewhat computational intensive, I only did it once.)
- WDR = tibble [45, 8]
- doc_mt_identifier_1 chr oai:openknowledge.worldbank.org:109~
- doc_mt_identifier_2 chr http://www-wds.worldbank.org/extern~
- doc_mt_title chr Development Economics through the ~
- doc_mt_date chr 2012-03-19T10:02:25Z 2012-03-19T19:~
- doc_mt_creator chr Yusuf, Shahid World Bank World Bank~
- doc_mt_subject chr ABSOLUTE POVERTY AGGLOMERATION BENE~
- doc_mt_description chr The World Development Report (WDR) ~
- doc_mt_set_spec chr oai:openknowledge.worldbank.org:109~
Ingestion of the WDR subject lists was possible from the metadata but presented issues (they were difficult to extract, and many records had repetitions or were apparently wrong), so I reconstructed them manually in data/raw_data/WDR_subjects_corrected2010_2011.xlsx, taking them from the site https://elibrary.worldbank.org/, which lists the keywords correctly (e.g. see the 2022 WDR).
# WRD metadata taken with API get (issues)
# WDR metadata taken with API get (issues)
# Load the previously-ingested WDR metadata, derive a numeric record id
# and a "full record" handle URL, and drop the one non-WDR record.
WDR <- readr::read_rds(here::here("data", "raw_data", "WDR.rds")) %>%
  # the numeric id is the portion of the OAI identifier AFTER the last "/"
  mutate(id = as.numeric(stringr::str_extract(doc_mt_identifier_1, "[^/]+$"))) %>%
  dplyr::relocate(id, .before = doc_mt_identifier_1) %>%
  # URL of the full-metadata page, used later to review subjects/keywords
  mutate(url_keys = paste0("https://openknowledge.worldbank.org/handle/10986/", id, "?show=full")) %>%
  # eliminate NON WDR book (id is numeric here, so compare to a number
  # rather than relying on implicit string coercion)
  dplyr::filter(id != 2586)
# WRD subject/date_issued taken by manual review
# WDR subject/date_issued taken by manual review
# Manually reconstructed subject lists (the API-provided ones were
# unreliable); rows with no id are dropped before filtering.
WDR_subjects <- readxl::read_excel(here::here("data", "raw_data",
"WDR_subjects_corrected2010_2011.xlsx")) %>%
drop_na(id) %>%
# eliminate NON WDR book
dplyr::filter(id != "2586")
# delete empty cols
ColNums_NotAllMissing <- function(df) { # helper function
  # Return the (unnamed) indices of columns holding at least one
  # non-missing value, i.e. identify columns that are NOT entirely NA.
  na_count_per_col <- colSums(is.na(df))
  unname(which(na_count_per_col < nrow(df)))
}
# Keep only columns that are not entirely NA (drop the empty subj_ columns).
WDR_subjects <- WDR_subjects %>%
select(ColNums_NotAllMissing(.))
# # convert all columns that start with "subj_" to lowercase
# WDR_subjects[3:218] <- sapply(WDR_subjects[3:218], function(x) tolower(x))
# join
# Join API metadata with the manually-reviewed subject lists, then tidy:
# drop redundant columns, rename, extract clean titles, and flag records
# with missing titles/subjects.
WDR_com <- left_join(WDR, WDR_subjects, by = "id") %>%
dplyr::relocate(date_issued, .before = id ) %>%
# drop useless columns
dplyr::select(#-doc_mt_identifier_1,
-doc_mt_identifier_2, -doc_mt_date,
-doc_mt_subject, -doc_mt_creator, -doc_mt_set_spec) %>%
# dplyr::relocate(url_keys, .after = subj_216 ) %>%
dplyr::rename(abstract = doc_mt_description) %>%
# correct titles -> keep only the portion AFTER the ":" separator
dplyr::mutate(., title = str_extract(doc_mt_title,"[^:]+$")) %>%
dplyr::relocate(title, .after = id) %>%
dplyr::rename(title_miss = doc_mt_title) %>%
# flag records whose extracted title is still the generic series name,
# i.e. the subtitle was missing from the metadata
dplyr::mutate(title_miss = case_when(
str_starts(title, "World Development Report") ~ "Y",
TRUE ~ NA_character_)
) %>%
# flag records with no subject keywords at all
dplyr::mutate(subject_miss = if_else(is.na(subj_1),
"Y",
NA_character_)) %>%
dplyr::relocate(subject_miss, .after = title_miss) %>%
dplyr::relocate(ISBN, .after = id)
#paint(WDR_com)
# convert all columns that start with "subj_" to lowercase (maybe redundant)
# across() keeps the data-frame structure directly; the previous sapply()
# version detoured through a character matrix before reassignment.
WDR_com <- WDR_com %>%
  dplyr::mutate(dplyr::across(dplyr::starts_with("subj_"), tolower))
# combine all `subj_...` vars into a vector separated by comma
col_subj <- names(WDR_com[, grep("^subj_", names(WDR_com))] )
# unite() concatenates subj_1..subj_46 into one comma-separated string,
# skipping NAs; the original columns are kept (remove = FALSE)
WDR_com <- WDR_com %>% tidyr::unite(
col = "all_subj",
subj_1:subj_46,
sep = ",",
remove = FALSE,
na.rm = TRUE) %>%
arrange(date_issued)
#paint(WDR_com)– Some manual correction of wrong metadata
# adding actual titles
# The metadata carries only the generic series title for the early WDRs,
# so the real subtitles are filled in by hand via an id -> title lookup.
wdr_title_fixes <- c(
  "5961" = "Prospects for Growth and Alleviation of Poverty",
  "5962" = "Structural Change and Development Policy",
  "5963" = "Poverty and Human Development",
  "5964" = "National and International Adjustment",
  "5965" = "Agriculture and Economic Development",
  "5966" = "Management in Development",
  "5967" = "Population Change and Development",
  "5968" = "International Capital and Economic Development",
  "5969" = "Trade and Pricing Policies in World Agriculture",
  "5970" = "Industrialization and Foreign Trade",
  "5971" = "Public Finance in Development"
)
for (fix_id in names(wdr_title_fixes)) {
  WDR_com$title[WDR_com$id == fix_id] <- wdr_title_fixes[[fix_id]]
}
# wrong year
WDR_com$date_issued[WDR_com$id == "11843"] <- "2013"
WDR_com$date_issued[WDR_com$id == "16092"] <- "2014"II.i) Troubleshooting some documents
PROBLEM: some of the subject collections are evidently wrong (either they are identical to another WDR's, or the list is implausibly long)
MY SOLUTION #1: I took them manually from the website “elibrary” https://elibrary.worldbank.org/action/showPublications?SeriesKey=b02
But, there still is WDR 2011 (“Conflict, Security, and Development”) which misses keywords
MY SOLUTION #2: I take the abstract and I create my own “plausible list” of subjects
— Extrapolate subjects from abstracts - for record with missing subjects/keywords
(*) One problem will remain: these corrected records contain single tokens rather than bi-/n-grams (which would make more sense)!
– WDR caseid = 4389
# --- tokenize abstract
# NOTE(review): `WDR_wr` is not defined anywhere in this file as shown —
# presumably created upstream (or should this be `WDR_com`?); verify.
WDR_4389 <- WDR_wr %>%
filter(id =="4389") %>%
select (abstract) %>% as_tibble() %>%
tidytext::unnest_tokens(word, abstract) # -> 251 words
# --- remove stop words
# --- isolate meaningful tokens
WDR_4389 <- WDR_4389 %>%
anti_join( stop_words , by = "word") %>% # -> 131 words
# Count observations by group
count(word, sort = T) # -> 101 words
# rename column in result corrected
# transpose so each word becomes a column (auto-named V1, V2, ...), then
# rename the V* columns to subj_* to mimic the subject columns
WDR_4389_w <- t(WDR_4389) %>% as_tibble()
names(WDR_4389_w) <- gsub(x = names(WDR_4389_w), pattern = "\\V", replacement = "subj_")
# --- graph words
# p <- WDR_4389 %>% count(word, sort = TRUE) %>%
# filter(n > 600) %>%
# mutate(word = reorder(word, n)) %>%
# ggplot(aes(n, word)) +
# geom_col() +
# labs(y = NULL)
# p
#
# --- replace as subjects
# Keep only the first row of the transposed table (the words themselves;
# the second row holds the counts) and tag it with the WDR record id.
WDR_4389_w <- WDR_4389_w %>%
filter(row_number() == 1 ) %>%
mutate(id = 4389) %>%
relocate(id, .before = subj_1)
# names(WDR_4389_w)
# collapse subj_1..subj_100 into one comma-separated `all_subj` string
WDR_4389_w <- WDR_4389_w %>%
tidyr::unite(
col = "all_subj",
subj_1:subj_100,
sep = ",",
remove = FALSE)
#names(WDR_com)
# align master table so subj_1 directly follows all_subj, matching the
# column layout of the reconstructed record above
WDR_com_2 <- WDR_com %>%
relocate(subj_1 ,.after = all_subj)
# # create a vector of column names to add to reach the n length(WDR_com)
# col_names <- as.vector(paste0('subj_', length(WDR_4389_w):105))
#
# # make a df with those cols and 1 row made of Nas values
# to_add <- bind_rows(setNames(rep("", length(col_names)), col_names))[NA_character_, ]
#
# # --- pad until subj_216 with nas.........
# WDR_4389_w_pad <- bind_cols(WDR_4389_w, to_add)
#### -- NOW: replace corrected single WDR case into master df
# # initial check ---
# WDR_com$subj_1[WDR_com$id == 4389]
# WDR_com$subj_3[WDR_com$id == 4389]
# #---
#id <- 4389
# NOTE(review): `[-(1)]` drops only the first column ("id"); despite the
# trailing comment, "all_subj" is still included — confirm intent.
col_subj_names <- names(WDR_4389_w)[-(1)] # without "id" "all_subj"
# pick the id
i <- 4389
# # --- function --- NO JOY!
# for (j in 1:length(col_subj_names)) {
# col <- col_subj_names[j]
# # print(col) # nolint
# WDR_com %>%
# dplyr::filter (id == i) %>%
# dplyr::mutate (., col = WDR_4389_w_pad$col)
# WDR_com_2 <- WDR_com
# }
# ---------# Solution from SO guy
# r is vectorized so
WDR_com_2[WDR_com_2$id %in% WDR_4389_w$id, 9:55] <- WDR_4389_w[, 2:48] * I cut some …so this record WDR_4389 will be incomplete
II.ii) – SAVE wdr and cleanenv
# Final tidy copy of the master table: drop the helper flag, add a decade
# label, coerce numeric columns, and persist for downstream notebooks.
wdr <- WDR_com_2 %>%
  select(-title_miss) %>%
  # decade label derived from the leading digits of the issue year;
  # years outside 1970-2029 would get NA (none exist in this corpus)
  mutate(decade = case_when(
    str_detect(string = date_issued, pattern = "^197") ~ "1970s",
    str_detect(string = date_issued, pattern = "^198") ~ "1980s",
    str_detect(string = date_issued, pattern = "^199") ~ "1990s",
    str_detect(string = date_issued, pattern = "^200") ~ "2000s",
    str_detect(string = date_issued, pattern = "^201") ~ "2010s",
    str_detect(string = date_issued, pattern = "^202") ~ "2020s"
  )) %>%
  relocate(decade, .after = date_issued) %>%
  # correct some datatypes — across() replaces the superseded mutate_at()
  mutate(across(c(date_issued, altmetric), as.numeric))
dataDir <- fs::path_abs(here::here("data", "derived_data"))
fileName <- "wdr.rds"
# file.path() supplies the separator, unlike the previous paste0() call
# that hard-coded a leading "/" in the file name
Dir_File <- file.path(dataDir, fileName)
write_rds(x = wdr, file = Dir_File)
# # ls objects
# list_old_WDR <- ls(# pattern = "^WDR",
# all.names = TRUE)
# list_old_WDR
# rm(list = setdiff(list_old_WDR, c("stop_words", "stop_words_stem")))I.iii) > > Part of Speech Tagging
Tagging segments of speech for part-of-speech (nouns, verbs, adjectives, etc.) or entity recognition (person, place, company, etc.) https://m-clark.github.io/text-analysis-with-R/part-of-speech-tagging.html
– tagging with cleanNLP
AH: https://datavizs22.classes.andrewheiss.com/example/13-example/#sentiment-analysis
Here’s the general process for tagging (or “annotating”) text with the cleanNLP package:
- Make a dataset where one column is the id (line number, chapter number, book+chapter, etc.), and another column is the text itself.
- Initialize the NLP tagger. You can use any of these:
-
cnlp_init_udpipe(): Use an R-only tagger that should work without installing anything extra (a little slower than the others, but requires no extra steps!) -
cnlp_init_spacy(): Use spaCy (if you’ve installed it on your computer with Python) -
cnlp_init_corenlp(): Use Stanford’s NLP library (if you’ve installed it on your computer with Java)
-
- Feed the data frame from step 1 into the cnlp_annotate() function and wait.
- Save the tagged data on your computer so you don’t have to re-tag it every time.
III) ABSTRACTS
All the following tasks were performed on the abstracts of the WDRs. Why?
+ because I needed to learn
+ because abstracts tend to include the ***keywords***
III.i) Tokenization
A “type” is an abstract word form, while a “token” is a concrete instance of that form as used in actual language — compare the abstract thing (‘wizards’) with an individual instance of the thing (‘Harry Potter’). Breaking a piece of text into words is thus called “tokenization”, and it can be done in many ways.
— The choices of tokenization
- Should words be lowercased? x
- Should punctuation be removed? x
- Should numbers be replaced by some placeholder?
- Should words be stemmed (also called lemmatization). x
- Should bigrams/multi-word phrase be used instead of single word phrases?
- Should stopwords (the most common words) be removed? x
- Should rare words be removed?
— Tokenization using regular expression syntax
The R function strsplit lets us do just this: split a string into pieces. *Note, for example, that this makes the word “Don’t” into two words.
— Tokenization using tidytext
The simplest way is to remove anything that isn’t a letter. The workhorse function in tidytext is unnest_tokens. It creates a new columns (here called ‘words’) from each of the individual ones in text.
# first abstract only, as a one-column tibble (column name: `value`)
abs_1 <- as_tibble(wdr$abstract[1] )
# LIST OF features I can add to `unnest_tokens`
# Each element tokenizes the same abstract with a different option, so
# the variants can be compared side by side after column-binding.
tok_feat_l <- list(
# 1) all 2 lowercase
abs_1 %>% unnest_tokens(word, value) %>% select(lowercase = word),
# 4) `SnowballC::wordStem` extracts stems of each given words in the vector.
abs_1 %>% unnest_tokens(word, value) %>% rowwise() %>%
mutate(word = SnowballC::wordStem(word)) %>% select(stemmed = word),
# 1.b) keep uppercase if there are
abs_1 %>% unnest_tokens(word, value, to_lower = F) %>%
select(uppercase = word),
# 2) keep punctuation {default is rid}
abs_1 %>% unnest_tokens(word, value, to_lower = F, strip_punc = FALSE) %>%
select(punctuations = word),
# 5) bigram
abs_1 %>%
unnest_tokens(word, value, token = "ngrams", n = 2, to_lower = F) %>%
select(bigrams = word)
)
# Return a data frame created by column-binding.
# map_dfc() needs equal row counts across elements; head(10) guarantees it
tok_feat_df <- map_dfc(tok_feat_l , ~ .x %>% head(10))
tok_feat_df
# # my choice
# abs_1_t_mod <- abs_1 %>%
# # no punctuation, yes capitalized
# unnest_tokens(word, value, to_lower = F, strip_punc = TRUE) %>% # 249 obs
# # exclude stopwords
# anti_join(stop_words) # 109 obs
#
# head(abs_1_t_mod, 15)— Tokenizing ALL abstracts
# isolate only abstracts
abs_all <- wdr %>%
dplyr::select(id, date_issued, title, abstract)
# one row per token across all abstracts; stop words removed via anti_join
abs_all_token <- abs_all %>%
unnest_tokens(output = word,
input = abstract ,
to_lower = T, # otherwise cannot match the stop_words
strip_punc = TRUE
) %>% #10018
anti_join(stop_words) # 4613
# Count words
# Frequency table of tokens across all abstracts with a rank column
# (rank 1 = most frequent); words seen <= 2 times and empty strings dropped.
wordcounts <- abs_all_token %>%
group_by(word) %>%
summarize(n = n()) %>%
#arrange(-n) %>%
# head(5) %>%
mutate(rank = rank(-n)) %>%
filter(n > 2, word != "")III.ii) Word and document frequency: TF-IDF
The goal is to quantify what a document is about. What is the document about?
- term frequency (tf) = how frequently a word occurs in a document… but there are words that occur many time and are not important
- term’s inverse document frequency (idf) = decreases the weight for commonly used words and increases the weight for words that are not used very much in a collection of documents.
- statistic tf-idf (= tf-idf) = an alternative to using stopwords is the frequency of a term adjusted for how rarely it is used. [It measures how important a word is to a document in a collection (or corpus) of documents, but it is still a rule-of-thumb or heuristic quantity]
The tf-idf is the product of the term frequency and the inverse document frequency::
\[ \begin{aligned} tf(\text{term}) &= \frac{n_{\text{term}}}{n_{\text{terms in document}}} \\ idf(\text{term}) &= \ln{\left(\frac{n_{\text{documents}}}{n_{\text{documents containing term}}}\right)} \\ tf\text{-}idf(\text{term}) &= tf(\text{term}) \times idf(\text{term}) \end{aligned} \]
— Pre-process 4 TF-IDF
# PREP X TF-IDF
#paint(abs_all)
# sanity checks: titles and issue years should both be unique per report
skimr::n_unique(abs_all$title)
skimr::n_unique(abs_all$date_issued)
# Count words (with stopwords)
# per-year token counts: one row per (date_issued, word) pair
temp <- abs_all %>%
unnest_tokens(output = word,
input = abstract,
to_lower = T, # otherwise cannot match the stop_words
strip_punc = TRUE) %>% #9769 = tot words
# mutate(word = SnowballC::wordStem(word)) %>%
## other important pre-process step
# mutate(title = factor(title, ordered = TRUE)) %>%
# mutate(date_issued = factor(date_issued, ordered = TRUE)) %>%
## implicit group by ...
group_by(date_issued) %>%
# Count observations by group
count(word, sort = TRUE) %>% # 5860 # unique words
ungroup()
# attach titles/abstracts back onto each (year, word) count row
abs_words <- left_join(temp, abs_all, by = "date_issued")
paint(abs_words)modo I) {Kumaran Ponnambalam} Create a Word Frequency Table
# abs_words_freq <- abs_words %>%
# anti_join(stop_words, by= "word" ) %>%
# tidyr::pivot_wider(names_from = word, values_from = n, values_fill = 0)
#
# # skim(abs_words_freq)
#
# #Generate the Document Term matrix
# abs_words_freq_matrix <- as.matrix(abs_words_freq)
#
# abs_words_freq_matrix[ , 'gender']
#
# # .... uses {tm}
#str(abs_words_freq_matrix)modo II) {Julia Silge and David Robinson} tidytext::bind_tf_idf
The idea of tf-idf is to find the important words for the content of each document by decreasing the weight for commonly used words and increasing the weight for words that are not used very much in a collection or corpus of documents. Calculating tf-idf attempts to find the words that are important (i.e., common) in a text, but not too common. Let’s do that now.
### --- abstracts with totals
# Calculate the total appearances of each words per doc
# `total` = total token count of the abstract, needed as the tf denominator
abs_total_words <- abs_words %>%
dplyr::group_by(title) %>%
dplyr::summarise(total = sum(n))
# Join the total appearances of each words per doc
abs_words_T <- left_join(abs_words, abs_total_words) %>%
select(id, date_issued, title, abstract, word, n, total )
# The usual suspects are here, “the”, “and”, “to”, and so forth.
# ggplot(abs_words_T, aes(n/total, fill = title)) +
# geom_histogram(show.legend = FALSE) +
# xlim(NA, 0.01) +
# facet_wrap(~title, ncol = 2, scales = "free_y")
tidytext::bind_tf_idf: Calculate and bind the term frequency and inverse document frequency of a tidy text dataset, along with the product, tf-idf, to the dataset. Each of these values are added as columns. This function supports non-standard evaluation through the tidyeval framework.
# tf-idf per (word, title) pair: term = word, document = title, count = n
abs_words_tf_idf <- abs_words_T %>%
#bind_tf_idf(tbl, term, document, n)
bind_tf_idf( word, title, n) %>% # 5860
# get rid of stopwords anyway &...
anti_join(stop_words, by = "word") %>% # 3703
# filter most heavily weighted terms (disabled)
# filter(tf_idf > 0.01 ) %>% # 2278
arrange(date_issued, desc(tf_idf))
abs_words_tf_idfNotice that idf and thus tf-idf are zero for these extremely common words. These are all words that appear in all docs, so the idf term (which will then be the natural log of 1) is zero.
The inverse document frequency (and thus
tf-idf) is very low (near zero) for words that occur in many of the documents in a collection; this is how this approach decreases the weight for common words. IDF will be a higher number for words that occur in fewer of the documents in the collection.
Let’s look at recurring terms terms with high tf-idf in WDRs.
# wdr[ wdr$id %in% c("4391" ) , c("date_issued", "title" )]
# let's look specifically at "Gender Equality and Development"
# (the 2012 WDR), sorted by descending tf-idf
tf_idf_2012 <- abs_words_tf_idf %>%
filter(date_issued == "2012") %>%
select(-total) %>%
arrange(desc(tf_idf))
# These words are, as measured by tf-idf, the most important to "Gender Equality and Development" and most readers would likely agree.
tf_idf_2012[tf_idf_2012$word %in% c("gender", "equality", "development") ,]
# # A tibble: 3 × 7
# date_issued title word n tf idf tf_idf
# <chr> <chr> <chr> <int> <dbl> <dbl> <dbl>
# 1 2012 " Gender Equality and Development" gender 13 0.0588 3.09 0.182
# 2 2012 " Gender Equality and Development" equality 6 0.0271 3.78 0.103
# 3 2012 " Gender Equality and Development" development 10 0.0452 0 0 — TF-IDF tables/ viz for selected WDRs
Interestingly, some themes are recurrent in cycles (as per (Yusuf 2008)). So I wanted to check TF_IDF in these “subsets” of WDRs
Poverty
# wdr[ wdr$id %in% c("5961", "5963", "5973", "11856" ) , c("date_issued", "title" )]
# SIMPLE TABLE WITH FILTER
# tf-idf table restricted to the four poverty-themed WDRs;
# words appearing only once are dropped
tf_idf_poverty <- abs_words_tf_idf %>%
dplyr::filter(date_issued %in% c("1978", "1980", "1990", "2001")) %>%
dplyr::filter(n > 1) %>%
select(-id, -abstract) %>%
dplyr::arrange(date_issued, desc(tf_idf)) What this TF-IDF measure shows is the specific words that distinguish each WDR in this subset themed on poverty: i.e. the point of tf-idf is to identify words that are important to one document within a collection of documents.
…viz
library(forcats) # Tools for Working with Categorical Variables (Factors)
# Faceted bar chart: top-50 tf-idf tokens for each poverty-themed WDR
gg_pov_tfidf <- tf_idf_poverty %>%
mutate (title2 = paste( "WDR of ", date_issued )) %>%
group_by(title2) %>%
# top 50 tokens by tf-idf within each report
slice_max(tf_idf, n = 50) %>%
ungroup() %>%
ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = title2)) +
geom_col(show.legend = FALSE) +
#my_theme() +
scale_fill_manual(values = mycolors_contrast) + # my palette
# labs(x = "tf-idf for ", y = NULL) +
labs(title= bquote("TF-IDF ranking in 4 WDRs focused on "~bold("poverty topic")),
subtitle="(overall 50 tokens with highest TF-IDF)",
#caption="Source: ????",
x="TF-IDF values",
y=""
) +
facet_wrap(~title2, ncol = 2, scales = "free")
gg_pov_tfidf
gg_pov_tfidf %T>%
ggsave(., filename = here("analysis", "output", "figures", "gg_pov_tfidf.pdf"),
# width = 2.75, height = 1.5, units = "in", device = cairo_pdf
) %>%
ggsave(., filename = here("analysis", "output", "figures", "gg_pov_tfidf.png"),
#width = 2.75, height = 1.5, units = "in", type = "cairo", dpi = 300
)Environment/Climate
…viz
Evident how in the 2010 WDR, words like “warming” and “temperatures” appear, while they were unimportant in the 1992 flagship report.
# TF-IDF ranking for the environment/climate-themed WDR subset.
# NOTE(review): `tf_idf_env` is not defined in the code shown here — it is
# presumably built like `tf_idf_poverty`/`tf_idf_knowl`; verify upstream.
gg_env_tfidf <- tf_idf_env %>%
  mutate(title2 = paste("WDR of ", date_issued)) %>%
  group_by(title2) %>%
  # top 50 tokens by tf-idf within each report
  slice_max(tf_idf, n = 50) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = title2)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = mycolors_contrast) + # my palette
  # typo "foucused" fixed in the displayed title
  labs(title = bquote("TF-IDF ranking in 4 WDRs focused on "~bold("environment/climate change")),
       subtitle = "(overall 50 tokens with highest TF-IDF)",
       x = "TF-IDF values",
       y = ""
  ) +
  facet_wrap(~title2, ncol = 2, scales = "free")
gg_env_tfidf
gg_env_tfidf %T>%
ggsave(., filename = here("analysis", "output", "figures", "gg_env_tfidf.pdf"),
# width = 2.75, height = 1.5, units = "in", device = cairo_pdf
) %>%
ggsave(., filename = here("analysis", "output", "figures", "gg_env_tfidf.png"),
#width = 2.75, height = 1.5, units = "in", type = "cairo", dpi = 300
)Knowledge/data
# skim(abs_words_tf_idf$tf_idf)
# wdr[ wdr$id %in% c("5981", "35218" ) , c("date_issued", "title" )]
# SIMPLE TABLE WITH FILTER
# tf-idf table restricted to the knowledge (1998) and data (2021) WDRs;
# words appearing only once are dropped
tf_idf_knowl <- abs_words_tf_idf %>%
dplyr::filter(date_issued %in% c("1998", "2021")) %>%
dplyr::filter(n > 1) %>%
select(-id, -abstract) %>%
dplyr::arrange(date_issued, desc(tf_idf))…viz
# TF-IDF ranking for the knowledge (1998) and data (2021) WDRs.
gg_knowl_tfidf <- tf_idf_knowl %>%
  # redundant guard: tf_idf_knowl is already restricted to these two years
  dplyr::filter(date_issued %in% c("1998", "2021")) %>%
  mutate(title2 = paste("WDR of ", date_issued)) %>%
  group_by(title2) %>%
  # top 50 tokens by tf-idf within each report
  slice_max(tf_idf, n = 50) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = title2)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = mycolors_contrast) + # my palette
  # fixed displayed title: typo "foucused", and this subset has 2 WDRs, not 4
  labs(title = bquote("TF-IDF ranking in 2 WDRs focused on "~bold("knowledge/data")),
       subtitle = "(overall 50 tokens with highest TF-IDF)",
       x = "TF-IDF values",
       y = ""
  ) +
  facet_wrap(~title2, ncol = 2, scales = "free")
gg_knowl_tfidf
gg_knowl_tfidf %T>%
ggsave(., filename = here("analysis", "output", "figures", "gg_knowl_tfidf.pdf"),
# width = 2.75, height = 1.5, units = "in", device = cairo_pdf
) %>%
ggsave(., filename = here("analysis", "output", "figures", "gg_knowl_tfidf.png"),
#width = 2.75, height = 1.5, units = "in", type = "cairo", dpi = 300
)III.iii) Word frequency histogram {meaningless}
— SCHMIDT’s Plotting most frequent words (all abstractS) —————-
http://benschmidt.org/HDA/texts-as-data.html
The simple plot gives a very skewed curve: As always, you should experiment with multiple scales, and especially think about logarithms. Putting logarithmic scales on both axes reveals something interesting about the way that data is structured; this turns into a straight line.
“Zipf’s law:” the most common word is twice as common as the second most common word, three times as common as the third most common word, four times as common as the fourth most common word, and so forth.
# Putting logarithmic scales on both axes
# log-log plot of rank vs frequency: Zipf's law appears as a straight line
ggplot(wordcounts) +
aes(x = rank, y = n, label = word) +
geom_point(alpha = .3, color = "grey") +
geom_text(check_overlap = TRUE) +
scale_x_continuous(trans = "log") +
scale_y_continuous(trans = "log") +
labs(title = "Zipf's Law",
subtitle="The log-frequency of a term is inversely correlated with the logarithm of its rank.")
# ...the logarithm of rank decreases linearly with the logarithm of count --> common words are very common indeed, and logarithmic scales are more often appropriate for plotting than linear ones.
— SILGE’s Plotting most frequent words (all abstractS) —————-
- OKKIO n instead of n/total
# abs_words_T --> 5860
# let's eliminate stopwords
# one row per word-WDR(abstract) combination; `n` = count of the word in
# that abstract, `total` = total words in that abstract
abs_words2 <- anti_join(x = abs_words_T, y = stop_words, by= "word" ) %>%
#filter(n > 1) %>%
select(date_issued, title, word, n, total)
abs_words2 #--> 3699
# paint(abs_words2)
# here there is one row for each word-WDR(abs) combination
# `n` is the number of times that word is used in that book and
# `total` is the total words in that abstractfrequency = let’s look at the distribution of n/total for each doc, the number of times a word appears in a doc divided by the total number of terms (words) in that doc
I actually use `n` instead, because the counts are small — only the abstracts were used, not the full reports.
# 2021 WDR only: word-frequency histogram (x = times a word appears in the
# abstract, y = how many distinct words appear that many times)
one <- abs_words2 %>%
filter ( date_issued == "2021") %>%
mutate (freq = n/total)
ggplot(data = one,
mapping = aes(x = n, fill = title)) + # y axis not needed ... R will count
geom_histogram(binwidth = 1,
color = "white") +
scale_y_continuous(breaks= pretty_breaks()) +
xlim(0,10) +
labs(# title = title,
x = "frequency",
y = "N of words @ that frequency") +
guides( fill = "none")
# # skim(one$freq)
# ggplot(data = one,
# mapping = aes(x = freq, fill = title)) + # y axis not needed ... R will count
# geom_histogram(binwidth = 1,
# color = "white") +
# scale_y_continuous(breaks= pretty_breaks()) +
# xlim(0, 0.1) +
# labs(# title = title,
# x = "frequency",
# y = "N of words @ that frequency") +
# guides( fill = "none")# overlayed mess!
# Overlaid histograms for all WDRs at once (hard to read), then the same
# chart faceted by issue year.
ggplot(abs_words2, aes(n, fill = title)) +
geom_histogram(binwidth = 1,
color = "white") +
scale_y_continuous(breaks= pretty_breaks()) +
xlim(0, 20) +
labs(#title = ~date_issued,
x = "frequency",
y = "N of words @ that frequency") +
guides( fill = "none")ggplot(abs_words2, aes(n, fill = title)) +
geom_histogram(binwidth = 1,
color = "white") +
scale_y_continuous(breaks= pretty_breaks()) +
xlim(0, 10) +
labs(#title = ~date_issued,
x = "frequency",
y = "N of words @ that frequency") +
facet_wrap( ~date_issued ) + # , ncol = 2, scales = "free_y")
guides( fill = "none") # way to turn legend off— Multiple plots of Word Freq with [purrr]
# ---- NON Capisco
# # Preferred approach
# histos <- abs_words2 %>%
# group_by(title) %>%
# nest() %>%
# mutate(plot = map2(data, title,
# ~ggplot(data = .x , aes(n/total, fill = title)) +
# geom_histogram( show.legend = FALSE) +
# xlim(NA, 0.05) +
# # ggtitle(.y) +
# ylab("Words Frequency") +
# xlab("Distribution per WDR title")))
#
# histos$plot[[1]]
# ---- Capisco
# https://stackoverflow.com/questions/60671725/ggplot-add-title-based-on-the-variable-in-the-dataframe-used-for-plotting
# One histogram per WDR year: split the table by year, map a ggplot over
# each piece, and collect the plots into a list.
list_plot <- abs_words2 %>%
dplyr::group_split(date_issued) %>% # Split data frame by groups
map(~ggplot(.,
mapping = aes(x = n, fill = title)) +
geom_histogram(binwidth = 1,
color = "white") +
scale_y_continuous(breaks= pretty_breaks()) + # integer ticks {scales}
xlim(0, 18) + # max of freq is 17
labs(title = .$date_issued,
x = "frequency",
y = "N of words @ that frequency")
)
# NOTE(review): `.$date_issued` passed to labs() is a whole column; each
# split holds a single year, so the repeated value works as a title.
list_plot[[1]]
list_plot[[44]]
list_plot[[40]]
#grid.arrange(grobs = list_plot, ncol = 1)— Zipf’s law for WDR’s abstracts
Examine Zipf’s law for WDR’s abstracts with just a few lines of dplyr functions. The `rank` column here tells us the rank of each word within the frequency table; the table was already ordered by `n`, so we could use `row_number()` to find the rank.
Zipf’s law is often visualized by plotting rank on the x-axis and term frequency on the y-axis, on logarithmic scales. Plotting this way, an inversely proportional relationship will have a constant, negative slope.
# Zipf's law: word rank vs term frequency on log-log scales; an inversely
# proportional relationship appears as a straight line with negative slope.
# https://www.tidytextmining.com/tfidf.html#zipfs-law
freq_by_rank %>%
  filter(date_issued == "2021" | date_issued == "1998") %>%
  ggplot(aes(rank, `term frequency`, color = title)) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10() +
  labs(title = "Zipf’s law seen for knowledge (1998) & data (2021) WDRs",
       subtitle = "(1998) = blue | (2021) = red",
       x = "rank (log)",
       y = "term frequency (log)",
       color = "Legend")
perhaps we could view this as a broken power law with, say, three sections. Let’s see what the exponent of the power law is for the middle section of the rank range.
Let’s plot this fitted power law with the obtained data to see how it looks.
# Overlay the fitted power law (exponent estimated on the middle rank section)
# on the log-log Zipf plot for the 1998 and 2021 WDRs.
freq_by_rank %>%
  filter(date_issued == "2021" | date_issued == "1998") %>%
  ggplot(aes(rank, `term frequency`, color = title)) +
  # dashed reference line: the fitted power law
  geom_abline(intercept = -1.80, slope = -0.33,
              color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()
# The deviations we see here at high rank are not uncommon for many kinds of
# language; a corpus of language often contains fewer rare words than predicted
# by a single power law.
III.iv) Relationships between words: n-grams and correlations
https://www.tidytextmining.com/ngrams.html https://bookdown.org/Maxine/tidy-text-mining/tokenizing-by-n-gram.html
The one-token-per-row framework can be extended from single words to n-grams and other meaningful units of text (e.g. to see which words tend to follow others immediately, or that tend to co-occur within the same documents.)
-
tidytext’s `token = "ngrams"` argument is a method tidytext offers for calculating and visualizing relationships between words in your text dataset. It tokenizes by pairs of adjacent words rather than by individual ones. -
`ggraph` extends `ggplot2` to construct network plots, -
`widyr` calculates pairwise correlations and distances within a tidy data frame.
— Tokenizing by n-gram
The unnest_tokens function can also be used to tokenize into consecutive sequences of words, called n-grams. By seeing how often word X is followed by word Y, we can then build a model of the relationships between them.
# # my choice
# abs_1_t_mod <- abs_1 %>%
# # no punctuation, yes capitalized
# unnest_tokens(word, value, to_lower = F, strip_punc = TRUE) %>% # 249 obs
# # exclude stopwords
# anti_join(stop_words) # 109 obs
# Tokenize abstracts into bigrams (pairs of adjacent words), one per row.
abs_all_bigram <- abs_all %>%
  unnest_tokens(., output = bigram, input = abstract, token = "ngrams", n = 2)
head(abs_all_bigram[c("date_issued", "bigram")], 10)
# This data structure is still a variation of the tidy text format: it is
# structured as one-token-per-row but each token now represents a bigram.
— Operations on n-grams: counting and filtering
Not surprisingly, a lot are pairs of stopwords
Here, I can use tidyr::separate(), which splits a column into multiple based on a delimiter. This separate it into two columns, “word1” and “word2”, to then remove cases where either is a stop-word.
In other analyses, we may want to work with the recombined words. tidyr’s unite() function is the inverse of separate(), and lets us recombine the columns into one.
# separate each bigram into its two component words
bigrams_separated <- abs_all_bigram %>%
  tidyr::separate(bigram, c("word1", "word2"), sep = " ")
# since many are bigrams containing a stopword
bigrams_filtered <- bigrams_separated %>%
  # remove cases where either word is a stop-word
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
# OPPOSITE: reunite the words into a single bigram column
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

# --- (Trigrams) ---
In other analyses you may be interested in the most common trigrams, which are consecutive sequences of 3 words. We can find this by setting n = 3:
— Bigram ~ potential meaningful SLOGANs?
Which of this bigram might be a SLOGAN candidate?
…Maybe some of these bigram with high tf-idf
- external finance
- gender equality
- development impact
- digital revolution
- investment climate
- accelerating growth
- alleviating poverty
- …
— Analyzing bigrams
This one-bigram-per-row format is helpful for exploratory analyses of the text. Let’s see what comes before “poverty”, “change”, “knowledge”…
…or after “human”, “finance”, “bottom”:
— Analyzing bigrams: tf-idf
There are advantages and disadvantages to examining the tf-idf of bigrams rather than individual words. Pairs of consecutive words might capture structure that isn’t present when one is just counting single words, and may provide context that makes tokens more understandable. However, the per-bigram counts are also sparser (a typical two-word pair is rarer than either of its component words).
# tf-idf of stop-word-filtered bigrams, treating each year of issue as a document
bigram_tf_idf <- abs_all_bigram %>%
  # reconstruct the separated + filtered + united pipeline
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  # put the two word columns back together
  unite(bigram, word1, word2, sep = " ") %>%
  # then on that calculate tf-idf
  count(date_issued, bigram) %>%
  bind_tf_idf(bigram, date_issued, n) %>%
  arrange(desc(tf_idf))
head(bigram_tf_idf)

# --- (Using bigrams to provide context in sentiment analysis) ---
…
— Visualizing a network of bigrams with ggraph
The igraph package has many powerful functions for manipulating and analyzing networks.
One way to create an igraph object from tidy data is the igraph::graph_from_data_frame() function, which takes a data frame of edges with columns for “from”, “to”, and edge attributes (in this case “n”):
If vertices is NULL, then the first two columns of df (e.g. word1 = FROM & word2 = TO) are used as a symbolic edge list and additional columns (e.g. n) as edge attributes/weight. The names of the attributes are taken from the names of the columns.
Here, a graph can be constructed from the tidy object bigrams_counts_clean since it has three variables.
library(igraph) # Network Analysis and Visualization
bigrams_counts_clean
# filter for only relatively common combinations
bigram_graph <- bigrams_counts_clean %>%
  filter(n > 2) %>%
  # create an igraph graph: the first two columns (word1 = FROM, word2 = TO)
  # form the symbolic edge list; the remaining column (n) becomes an edge attribute
  igraph::graph_from_data_frame()
# Then we can convert an igraph object into a ggraph with the ggraph() function
# (extension of ggplot2) and add layers to it, much like in ggplot2. A basic
# graph needs three layers: "nodes", "edges", and "text".
# convert an igraph object into a ggraph with the ggraph() function
library(ggraph) # An Implementation of Grammar of Graphics for Graphs and Networks
set.seed(2022) # the "fr" layout is stochastic; fix the seed for reproducibility
ggraph(bigram_graph, layout = "fr") +
  # the three basic layers: edges, nodes, labels
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
# I can already see some common center nodes
We conclude with a few polishing operations to make a better looking graph (Figure 4.5):
- We add the `edge_alpha` aesthetic to the link layer to make links transparent based on how common or rare the bigram is (= n)
- We add directionality with an arrow, constructed using `grid::arrow()`, including an `end_cap` option that tells the arrow to end before touching the node
- We tinker with the options to the node layer to make the nodes more attractive (larger, blue points)
- We add a theme that’s useful for plotting networks, `theme_void()`
# Polished bigram network: reproducible layout, frequency-scaled edge
# transparency, directional arrows, and a plotting-friendly blank theme.
set.seed(2022)
a <- grid::arrow(type = "closed", length = unit(0.08, "inches"))
abs_bigram_graph <- ggraph(bigram_graph, layout = "fr") +
  # LINK layer: edge_alpha encodes how common the bigram is (= n)
  geom_edge_link(aes(edge_alpha = n),
                 show.legend = FALSE,
                 arrow = a,                          # directionality
                 end_cap = circle(0.08, "inches")) + # arrow stops before the node
  # NODE layer
  geom_node_point(size = 3, color = "lightblue") +
  geom_node_text(aes(label = name),
                 hjust = 1, vjust = 1,
                 check_overlap = TRUE,
                 repel = FALSE # adds more lines
                 ) +
  # THEME
  theme_void() +
  ggtitle("Word Network in WDR's abstracts")
abs_bigram_graph
# Print the plot, then save it as PDF and PNG (the tee pipe %T>% keeps the
# plot object flowing through the chain).
abs_bigram_graph %T>%
  print() %T>%
  ggsave(., filename = here("analysis", "output", "figures", "abs_bigram_graph.pdf"),
         #width = 4, height = 2.25, units = "in",
         device = cairo_pdf) %>%
  ggsave(., filename = here("analysis", "output", "figures", "abs_bigram_graph.png"),
         #width = 4, height = 2.25, units = "in",
         type = "cairo", dpi = 300)

# --- Counting and correlating among sections ---
Notes for “Text Mining with R: A Tidy Approach”
The widyr package makes operations such as computing counts and correlations easy, by simplifying the pattern of “widen data -> perform an operation -> then re-tidy data”. We’ll focus on a set of functions that make pairwise comparisons between groups of observations (for example, between documents, or sections of text).
# divide abstracts into 5-line sections
abs_section_words <- abs_all %>%
  mutate(text = stringi::stri_split_lines(abstract, omit_empty = FALSE)
  ) %>%
  #filter(date_issued == "1978") %>%
  # NOTE(review): section is derived from the rank of `abstract`, not from the
  # split lines in `text` — confirm this produces the intended 5-row grouping
  mutate(section = row_number(.$abstract) %/% 5) %>%
  filter(section > 0) %>%
  unnest_tokens(output = word,
                input = abstract) %>%
  filter(!word %in% stop_words$word)
# widyr::pairwise_count() counts the number of times each pair of items (words)
# appears together within a group defined by "feature" (section). Note it still
# returns a tidy data frame, although the underlying computation took place in
# matrix form.
abs_section_words %>%
  widyr::pairwise_count(item = word, feature = section, sort = TRUE) %>%
  # Since pairwise_count records both (word_A, word_B) and (word_B, word_A),
  # it does not matter whether we filter on item1 or item2
  filter(item1 == "developing")

# --- Pairwise correlation ---
We may want to examine correlation among words, which indicates how often they appear together relative to how often they appear separately.
we compute the \(\phi\) coefficient. Introduced by Karl Pearson, this measure is similar to the Pearson correlation coefficient in its interpretation. In fact, a Pearson correlation coefficient estimated for two binary variables will return the \(\phi\) coefficient. The phi coefficient is related to the chi-squared statistic for a 2 × 2 contingency table
\[ \phi = \sqrt{\frac{\chi^2}{n}} \]
where \(n\) denotes sample size. In the case of pairwise counts, \(\phi\) is calculated by
\[ \phi = \frac{n_{11}n_{00} - n_{10}n_{01}}{\sqrt{n_{1·}n_{0·}n_{·1}n_{·0}}} \]
We see, from the above equation, that \(\phi\) is “standardized” by individual counts, so various word pair with different individual frequency can be compared to each other:
The computation of \(\phi\) can be simply done by pairwise_cor (other choice of correlation coefficients specified by method). The procedure can be somewhat computationally expensive, so we filter out uncommon words
Which word is most correlated with “poor”? [health,people,governments, data ]
This lets us pick particular interesting words and find the other words most associated with them
source("R/f_facetted_bar_plot.R") # provides facet_bar()
# Top correlated words for a few words of interest, one facet per seed word
p_ass_words <- word_cors %>%
  filter(item1 %in% c( "people", "governments", "markets", "institutions")) %>%
  group_by(item1) %>%
  top_n(6) %>% # six strongest correlations per seed word
  ungroup() %>%
  facet_bar(y = item2, x = correlation, by = item1) +
  labs(title="Words most correlated to selected words of interest",
       subtitle="(Taken from WDRs' abstracts)",
       #caption="Source: ????",
  )
# print, then save as PDF and PNG
p_ass_words %T>%
  print() %T>%
  ggsave(., filename = here("analysis", "output", "figures", "p_ass_words.pdf"),
         #width = 4, height = 2.25, units = "in",
         device = cairo_pdf) %>%
  ggsave(., filename = here("analysis", "output", "figures", "p_ass_words.png"),
         #width = 4, height = 2.25, units = "in",
         type = "cairo", dpi = 300)

# How about a network visualization to see the overall correlation pattern?
# Network of word pairs with correlation above 0.15
word_cors %>%
  filter(correlation > .15) %>%
  tidygraph::as_tbl_graph() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE)
# Note that unlike the bigram analysis, the relationships here are symmetrical,
# rather than directional (there are no arrows).
III.iv) Concordances -> KWIC -> Collocation
- {Following LADAL Tutorial}
- {Following Ben Schmidt, Chp 8.2.3}
- https://alvinntnu.github.io/NTNU_ENC2036_LECTURES/corpus-analysis-a-start.html
In the language sciences, concordancing refers to the extraction of words from a given text or texts. Concordances are commonly displayed in the form of keyword-in-context displays (KWICs) where the search term is shown in context, i.e. with preceding and following words.
Concordancing is central to analyses of text, and it often represents the first step in more sophisticated analyses of language data, because concordances are extremely valuable for understanding how a word or phrase is used, how often it is used, and in which contexts it is used. As concordances allow us to analyze the context in which a word or phrase occurs and provide frequency information about word use, they also enable us to analyze collocations or the collocational profiles of words and phrases (Stefanowitsch 2020, 50–51). Finally, concordances can also be used to extract examples, which is a very common procedure.
— Concordances
- https://ladal.edu.au/textanalysis.html#Concordancing
- https://www.quantumjitter.com/project/deal/
Using
quanteda
— create kwic with individual keyword | purrr + print + save png
# Build a quanteda corpus from the abstracts (one document per WDR title).
# I use again data = abs_words
abs_q_corpus <- quanteda::corpus(as.data.frame(abs_all),
                                 docid_field = "title",
                                 text_field = "abstract",
                                 # NOTE(review): corpus() documents meta as a
                                 # *named* list; an unnamed list of column names
                                 # may not attach these fields — verify
                                 meta = list("id", "date_issued")
                                 )
# --- example with individual keyword
# Step 1) tokens: strip punctuation and symbols (numbers are kept)
abs_q_tokens <- tokens(x = abs_q_corpus,
                       remove_punct = TRUE,
                       remove_symbols = TRUE#,remove_numbers = TRUE
                       )
# Step 2) kwic (individual exe )
# kwic_abs_data <- quanteda::kwic(x = abs_q_tokens, # define text(s)
# # define pattern
# pattern = phrase(c("data", "knowledge")),
# # define window size
# window = 5) %>%
# # convert into a data frame
# as_tibble() %>%
# left_join(abs_all, by = c("docname" = "title")) %>%
# # remove superfluous columns
# dplyr::select( 'Year' = date_issued, 'WDR title' = docname, pre, keyword, post) %>%
# # slice_sample( n = 50) %>%
# kbl(align = "c") # %>% kable_styling()
# Step 2) kwic (on vector)
# Iterate `quanteda::kwic` over a vector of tokens | regex-modified-keywords
# keywords to search for ("*" is a glob wildcard in quanteda patterns)
keywords <- c("data", "globalization", "sustainab*", "conditionalit*", "regulat*", "ODA" )
# apply kwic() iteratively over the vector of keywords -> one tibble per keyword
outputs_key <- map(keywords,
                   ~quanteda::kwic(abs_q_tokens,
                                   pattern = .x,
                                   window = 5) %>%
                     as_tibble() %>%
                     left_join(abs_all, by = c("docname" = "title")) %>%
                     # remove superfluous columns
                     dplyr::select( 'Year' = date_issued, 'WDR title' = docname, pre, keyword, post)
)
# # all togetha 3
# number of keywords searched (assign once, with <-)
n <- length(keywords)
# outputs_key[[1]] %>%
#   kbl(align = "c")
# this list has no element names yet
names(outputs_key)
# set names for elements
outputs_key <- outputs_key %>%
  set_names(paste0("kwic_", keywords))
# get rid of empty output dfs in list (same idiom as the phrase version below)
outputs_key <- purrr::keep(outputs_key, ~nrow(.) > 0) # 4 left!
# -------------- print all
# Modo 1 - walk + print -
walk(.x = outputs_key, .f = print)
# Modo 2 - walk + kbl -
#walk(.x = outputs_key, .f = kbl)
# # Modo 3 - imap??? + kbl -
# purrr::imap(.x = outputs_key,
# .f = ~ {
# kbl(x = .x,
# align = "c",
# #format = "html",
# caption =.y
# ) # %>% kable_styling()
# }
# )
# MODO 4 -> create multiple tables from a single dataframe and save them as images
# https://stackoverflow.com/questions/69323569/how-to-save-multiple-tables-as-images-using-kable-and-map/69323893#69323893
# Save each non-empty kwic table as a PNG image (named after its list element)
outputs_key %>%
  imap(~save_kable(file = paste0('analysis/output/tables/', .y, '_.png'),
                   # bs_theme = 'journal',
                   self_contained = TRUE, # TRUE, not the reassignable alias T
                   x = kbl(.x, booktabs = TRUE, align = c('l','l', 'c')) %>%
                     kable_styling()
  )
  )

# --- create kwic with phrases | purrr + print + save png ---
# Iterate `quanteda::kwic` over a vector of phrases/bigrams
keywords_phrase <- c("climate change", "investment climate", "pro-poor",
                     "gender equality", "maximizing finance", "digital revolution",
                     "private finance")
# Step 1) tokens
# (done above) -> abs_q_tokens
# Step 2) kwic
# apply iteratively kwic over a vector of bigrams
outputs_bigrams <- map(keywords_phrase,
                       ~quanteda::kwic(x = abs_q_tokens, # define text(s)
                                       # multi-word pattern must be wrapped in phrase()
                                       pattern = quanteda::phrase(.x),
                                       # define window size
                                       window = 5) %>%
                         # convert into a data frame
                         as_tibble() %>%
                         left_join(abs_all, by = c("docname" = "title")) %>%
                         # remove superfluous columns
                         dplyr::select( 'Year' = date_issued,
                                        'WDR title' = docname, pre, keyword, post)
)
# number of bigrams (assign with <-, not =)
n_bi <- length(keywords_phrase)
n_bi # 7
# name this list's elements
outputs_bigrams <- outputs_bigrams %>%
  set_names(paste0("kwic_", keywords_phrase))
# get rid of empty output dfs in list (base-R way)
outputs_bigrams2 <- outputs_bigrams[sapply(
  outputs_bigrams, function(x) dim(x)[1]) > 0] # 4 left!
# or equivalently with purrr
outputs_bigrams3 <- purrr::keep(outputs_bigrams, ~nrow(.) > 0) # 4 left!
# -------------- print all
# walk + print -
walk(.x = outputs_bigrams2, .f = print)
# -------------- save all -> one PNG table per non-empty kwic result
# https://stackoverflow.com/questions/69323569/how-to-save-multiple-tables-as-images-using-kable-and-map/69323893#69323893
outputs_bigrams2 %>%
  imap(~save_kable(file = paste0('analysis/output/tables/', .y, '_.png'),
                   # bs_theme = 'journal',
                   self_contained = TRUE, # TRUE, not the reassignable alias T
                   x = kbl(.x, booktabs = TRUE, align = c('l','l', 'c')) %>%
                     kable_styling()
  )
  )

# --- Collocation ---
- https://ladal.edu.au/coll.html#2_Finding_Collocations
Collocations are words that are attracted to each other (and that co-occur or co-locate together), e.g., Merry Christmas, Good Morning, No worries. Any word in any given language has collocations, i.e., other words that are attracted/attractive to that word. This allows us to anticipate what word comes next, and collocations are context/text-type specific. Various statistical measures are used to define the strength of collocations, like the Mutual Information (MI) score and log-likelihood (see here for an overview of different association-strength measures).
–> EXE: Collocation for subset on poverty WDR
- In a first step, we will split the Abstract into individual sentences.
# reduce the poverty-themed abstracts to just one long concatenated string
abs_pov <- abs_all %>%
  dplyr::filter(date_issued %in% c("1978", "1980", "1990", "2001")) %>%
  select( abstract) %>%
  summarize(text = str_c(abstract, collapse = ". ")) %>%
  as.character()
# read in and process text
abs_pov_sentences <- abs_pov %>%
  stringr::str_squish() %>%
  # divide into sentences
  tokenizers::tokenize_sentences(.) %>%
  unlist() %>%
  stringr::str_remove_all("- ") %>%      # drop hyphenation artifacts
  stringr::str_replace_all("\\W", " ") %>% # non-word characters -> spaces
  stringr::str_squish()
# inspect data
head(abs_pov_sentences)
# In a next step, we will create a matrix that shows how often each word
# co-occurred with each other word in the data.
# convert into a tm corpus (one "document" per sentence)
abs_pov_corpus <- Corpus(VectorSource(abs_pov_sentences))
# create vector with extra (non-standard) stopwords to remove
extrawords <- c("the", "can", "get", "got", "can", "one",
                "dont", "even", "may", "but", "will",
                "much", "first", "but", "see", "new",
                "many", "less", "now", "well", "like",
                "often", "every", "said", "two")
# clean corpus
abs_pov_corpus_clean <- abs_pov_corpus %>%
  tm::tm_map(removePunctuation) %>%
  tm::tm_map(removeNumbers) %>%
  # tolower is a plain character function, so it must be wrapped in
  # content_transformer(); applying it bare via tm_map corrupts the corpus
  tm::tm_map(content_transformer(tolower)) %>%
  tm::tm_map(removeWords, stopwords()) %>%
  tm::tm_map(removeWords, extrawords)
# create document term matrix (binary weighting: presence/absence per sentence)
abs_pov_dtm <- DocumentTermMatrix(
  abs_pov_corpus_clean,
  control=list(bounds = list(global=c(1, Inf)),
               weighting = weightBin))
# convert dtm into sparse matrix
abs_pov_sdtm <- Matrix::sparseMatrix(i = abs_pov_dtm$i, j = abs_pov_dtm$j,
                                     x = abs_pov_dtm$v,
                                     dims = c(abs_pov_dtm$nrow, abs_pov_dtm$ncol),
                                     dimnames = dimnames(abs_pov_dtm))
# calculate co-occurrence counts (term-by-term matrix)
coocurrences <- t(abs_pov_sdtm) %*% abs_pov_sdtm
# convert into a dense matrix
collocates <- as.matrix(coocurrences)
# We can inspect this co-occurrence matrix: ncol() gives the number of terms it
# represents, summary() shows how often terms occur in the data.
The ncol function reports that the data represents 239 words and that the most frequent word occurs 163 times in the text.
The output of the summary function tells us that the minimum frequency of a word in the data is 5 with a maximum of 163. The difference between the median (18) and the mean (22) indicates that the frequencies are distributed non-normally - which is common for language data.
–> (EXE) Visualizing Collocations EXE “poverty”
We will now use an example of one individual word ( poverty ) to show, how collocation strength for individual terms is calculated and how it can be visualized.
The function calculateCoocStatistics is taken from Wiedemann and Niekler (n.d.) and applied to abs_pov_sdtm, the sparse document-term matrix.
Visualizing Collocations
# load function for co-occurrence calculation (Wiedemann & Niekler, n.d.)
# NOTE(review): sourcing from a remote URL requires network access at run time
source("https://slcladal.github.io/rscripts/calculateCoocStatistics.R")
# define term
coocTerm <- "development"
# calculate co-occurrence statistics (log-likelihood association measure)
coocs <- calculateCoocStatistics(coocTerm, abs_pov_sdtm, measure="LOGLIK")
# inspect the 20 strongest collocates
coocs[1:20]
# define term # 2
coocTerm2 <- "poverty"
# calculate co-occurrence statistics
coocs2 <- calculateCoocStatistics(coocTerm2, abs_pov_sdtm, measure="LOGLIK")
# inspect results
coocs2[1:20]
# The output shows that the word most strongly associated with "development" in
# the poverty WDR subset is "issues" -- here there is no substantive strength
# (substantive strength would indicate these terms are definitely collocates
# and almost -- if not already -- a lexicalized construction).
–> (EXE) Association Strength
There are various visualizations options for collocations. Which visualization method is appropriate depends on what the visualizations should display.
We start with the most basic and visualize the collocation strength using a simple dot chart. We use the vector of association strengths generated above and transform it into a table. Also, we exclude elements with an association strength lower than 30.
…[viz] association strengths
We can now visualize the association strengths as shown in the code chunk below.
# Dot chart of collocation strength with "poverty", strongest terms on top
p_ass_words_poverty <- ggplot(coocdf, aes(x = reorder(Term, CollStrength, mean), y = CollStrength)) +
  geom_point() +
  coord_flip() +
  #theme_void() +
  labs(title = "Association to word \"poverty\"",
       # fixed typo in displayed subtitle: "strenght" -> "strength"
       subtitle = "Collocation strength measured by log-likelihood",
       caption = "Source: https://ladal.edu.au/coll.html#Association_Strength",
       y = "",
       x = ""
  )
# print, then save as PDF and PNG
p_ass_words_poverty %T>%
  print() %T>%
  ggsave(., filename = here("analysis", "output", "figures", "p_ass_words_poverty.pdf"),
         #width = 4, height = 2.25, units = "in",
         device = cairo_pdf) %>%
  ggsave(., filename = here("analysis", "output", "figures", "p_ass_words_poverty.png"),
         #width = 4, height = 2.25, units = "in",
         type = "cairo", dpi = 300)

# The dot chart shows that "poverty" collocates more strongly with "economic"
# than with any other term.
–> (EXE) Dendrograms
Another method for visualizing collocations are dendrograms. Dendrograms (also called tree-diagrams) show how similar elements are based on one or many features. As such, dendrograms are used to indicate groupings as they show elements (words) that are notably similar or different with respect to their association strength. To use this method, we first need to generate a distance matrix from our co-occurrence matrix.
# keep only the terms that collocate with "poverty" (plus the node itself)
coolocs <- c(coocdf$Term, "poverty")
# remove non-collocating terms (first rows, then columns)
collocates_redux <- collocates[rownames(collocates) %in% coolocs, ]
collocates_redux <- collocates_redux[, colnames(collocates_redux) %in% coolocs]
# create distance matrix
distmtx <- dist(collocates_redux)
clustertexts <- hclust(   # hierarchical cluster object
  distmtx,                # use distance matrix as data
  method="ward.D2")       # ward.D2 as linkage method (comment matched to code)
ggdendrogram(clustertexts) +
  ggtitle("Terms strongly collocating with *poverty*")

# --- (EXE) Network Graphs ---
Network graphs are a very useful tool to show relationships (or the absence of relationships) between elements. Network graphs are highly useful when it comes to displaying the relationships that words have among each other and which properties these networks of words have.
–> (EXE) Basic Network Graphs
In order to display a network, we need to create a network graph by using the network function from the network package.
# build an undirected network from the collocation matrix, keeping the cell
# values as an edge attribute called "weights"
net <- network::network(collocates_redux,
                        directed = FALSE,
                        ignore.eval = FALSE,
                        names.eval = "weights")
# vertex names
network.vertex.names(net) <- rownames(collocates_redux)
# inspect object
net
# Now that we have generated a network object, we visualize the network with
# GGally::ggnet2.
We can customize the network object so that the visualization becomes more appealing and informative. To add information, we create vector of words that contain different groups, e.g. terms that rarely, sometimes, and frequently collocate with poverty (I used the dendrogram which displayed the cluster analysis as the basis for the categorization).
Based on these vectors, we can then change or adapt the default values of certain attributes or parameters of the network object (e.g. weights, linetypes, and colors).
# create vectors with collocation strength categories (derived from the dendrogram)
mid <- c("dimensions", "major", "developing", "social", "health")
high <- c("economic", "countries")
infreq <- colnames(collocates_redux)[!colnames(collocates_redux) %in% mid & !colnames(collocates_redux) %in% high]
# add a vertex attribute: collocation strength group
net %v% "Collocation" <- ifelse(network.vertex.names(net) %in% infreq, "weak",
                                ifelse(network.vertex.names(net) %in% mid, "medium",
                                       ifelse(network.vertex.names(net) %in% high, "strong", "other")))
# map group -> node color
net %v% "color" <- ifelse(net %v% "Collocation" == "weak", "gray60",
                          ifelse(net %v% "Collocation" == "medium", "orange",
                                 ifelse(net %v% "Collocation" == "strong", "indianred4", "gray60")))
# rescale edge size from the raw weights
network::set.edge.attribute(net, "weights", ifelse(net %e% "weights" < 1, 0.1,
                                                   ifelse(net %e% "weights" <= 2, .5, 1)))
# define line type from the rescaled weights
network::set.edge.attribute(net, "lty", ifelse(net %e% "weights" <=.1, 3,
                                               ifelse(net %e% "weights" <= .5, 2, 1)))
# We can now display the network object and make use of the added information.
# Polished network: node color/size and edge style use the attributes set above
p_ggnet_poverty <- GGally::ggnet2(net,
                                  color = "color",
                                  label = TRUE,
                                  label.size = 4,
                                  alpha = 0.2,
                                  size = "degree",
                                  edge.size = "weights",
                                  edge.lty = "lty",
                                  edge.alpha = 0.2) +
  # "none" is the current ggplot2 idiom (logical FALSE is deprecated) and
  # matches guides(fill = "none") used elsewhere in this file
  guides(color = "none", size = "none") +
  #theme_void() +
  labs(title = "Degrees of association to word \"poverty\"",
       subtitle = "Weak (grey), medium (orange), strong (red)"#,
       # caption = "Source: https://ladal.edu.au/coll.html#Association_Strength",
       # y = "",
       # x = ""
  )
# print, then save as PDF and PNG
p_ggnet_poverty %T>%
  print() %T>%
  ggsave(., filename = here("analysis", "output", "figures", "p_ggnet_poverty.pdf"),
         #width = 4, height = 2.25, units = "in",
         device = cairo_pdf) %>%
  ggsave(., filename = here("analysis", "output", "figures", "p_ggnet_poverty.png"),
         #width = 4, height = 2.25, units = "in",
         type = "cairo", dpi = 300)

# --- (EXE) Biplots ---
An alternative way to display co-occurrence patterns are bi-plots, which are used to display the results of Correspondence Analyses. They are useful, in particular, when one is not interested in one particular key term and its collocations but in the overall similarity of many terms. Semantic similarity in this case refers to a shared semantic, and thus distributional, profile. As such, words can be deemed semantically similar if they have a similar co-occurrence profile - i.e. they co-occur with the same elements. Biplots can be used to visualize collocations because collocates co-occur and thus share semantic properties, which renders them more similar to each other compared with other terms.
# perform correspondence analysis (suppress the default plot)
res.ca <- FactoMineR::CA(collocates_redux, graph = FALSE)
# plot the row coordinates of the CA solution as a biplot
factoextra::fviz_ca_row(res.ca, repel = TRUE, col.row = "gray20")
The bi-plot shows that poverty and development collocate as they are plotted in close proximity. The advantage of the biplot becomes apparent when we focus on other terms because the biplot also shows other collocates such as issues and growth
–> (EXE) Determining Significance
In order to identify which words occur together significantly more frequently than would be expected by chance, we have to determine if their co-occurrence frequency is statistically significant. This can be done either for specific key terms or for the entire data. In this example, we will continue to focus on the key word poverty.
To determine which terms collocate significantly with the key term (poverty), we use multiple (or repeated) Fisher’s Exact tests which require the following information:
a = Number of times coocTerm occurs with term j
b = Number of times coocTerm occurs without term j
c = Number of times other terms occur with term j
d = Number of terms that are neither coocTerm nor term j
In a first step, we create a table which holds these quantities.
# convert the co-occurrence matrix to a data frame
coocdf <- as.data.frame(as.matrix(collocates))
# reduce data: remove self-co-occurrence and drop rare terms (freq <= 10)
diag(coocdf) <- 0
coocdf <- coocdf[which(rowSums(coocdf) > 10),]
coocdf <- coocdf[, which(colSums(coocdf) > 10)]
# extract stats: build the 2x2 contingency quantities (a, b, c, d)
# required by Fisher's Exact test for every Term/CoocTerm pair
cooctb <- coocdf %>%
  dplyr::mutate(Term = rownames(coocdf)) %>%
  # pivot_longer() supersedes gather(): stack all co-occurrence columns
  tidyr::pivot_longer(cols = -Term,
                      names_to = "CoocTerm",
                      values_to = "TermCoocFreq") %>%
  dplyr::mutate(Term = factor(Term),
                CoocTerm = factor(CoocTerm)) %>%
  # grand total of all co-occurrences
  dplyr::mutate(AllFreq = sum(TermCoocFreq)) %>%
  # total frequency of each term (row marginal)
  dplyr::group_by(Term) %>%
  dplyr::mutate(TermFreq = sum(TermCoocFreq)) %>%
  dplyr::ungroup() %>%
  # total frequency of each co-occurring term (column marginal)
  dplyr::group_by(CoocTerm) %>%
  dplyr::mutate(CoocFreq = sum(TermCoocFreq)) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(Term) %>%
  # cells of the 2x2 table: a = joint, b/c = marginals minus joint,
  # d = everything else
  dplyr::mutate(a = TermCoocFreq,
                b = TermFreq - a,
                c = CoocFreq - a,
                d = AllFreq - (a + b + c)) %>%
  # number of tests (used later for the Benjamini-Hochberg correction)
  dplyr::mutate(NRows = nrow(coocdf))
We now select the key term (poverty). If we wanted to find all collocations that are present in the data, we would use the entire data rather than only the subset that contains poverty.
Next, we calculate which terms are (significantly) over- and under-proportionately used with poverty. It is important to note that this procedure informs about both: over- and under-use! This is especially crucial when analyzing if specific words are attracted to or repelled by certain constructions. Of course, this approach is not restricted to analyses of constructions and it can easily be generalized across domains and has also been used in machine learning applications.
# Run the significance tests row by row on each 2x2 table (a, b / c, d).
coocStatz <- cooctb_redux %>%
  dplyr::rowwise() %>%
  # p-value of Fisher's Exact test ($p.value is clearer and safer than
  # extracting element [1] of the htest object by position)
  dplyr::mutate(p = fisher.test(matrix(c(a, b, c, d),
                                       ncol = 2, byrow = TRUE))$p.value) %>%
  # chi-squared statistic for the same table
  dplyr::mutate(x2 = as.vector(chisq.test(matrix(c(a, b, c, d),
                                                 ncol = 2, byrow = TRUE))$statistic)) %>%
  # phi coefficient (effect size for a 2x2 table)
  dplyr::mutate(phi = sqrt((x2 / (a + b + c + d)))) %>%
  # expected frequency of the focal cell under independence
  dplyr::mutate(expected = as.vector(chisq.test(matrix(c(a, b, c, d),
                                                       ncol = 2, byrow = TRUE))$expected[1])) %>%
  # FIX: the fall-through condition of case_when() must be TRUE, not FALSE;
  # a FALSE left-hand side never matches, so non-significant rows ended up
  # as NA instead of "n.s."
  dplyr::mutate(Significance = dplyr::case_when(p <= .001 ~ "p<.001",
                                                p <= .01 ~ "p<.01",
                                                p <= .05 ~ "p<.05",
                                                TRUE ~ "n.s."))
We now add information to the table and remove superfluous columns so that the table can be more easily parsed.
# Apply the Benjamini-Hochberg correction for multiple testing and tidy up.
coocStatz <- coocStatz %>%
  dplyr::ungroup() %>%
  dplyr::arrange(p) %>%
  # rank of each p-value (needed for the BH thresholds)
  dplyr::mutate(j = dplyr::row_number()) %>%
  # perform benjamini-hochberg correction: threshold_i = (rank / n tests) * alpha
  dplyr::mutate(corr05 = ((j / NRows) * 0.05)) %>%
  dplyr::mutate(corr01 = ((j / NRows) * 0.01)) %>%
  dplyr::mutate(corr001 = ((j / NRows) * 0.001)) %>%
  # calculate corrected significance status
  # FIX: the fall-through condition of case_when() must be TRUE, not FALSE;
  # a FALSE left-hand side never matches, so non-significant rows ended up
  # as NA instead of "n.s."
  dplyr::mutate(CorrSignificance = dplyr::case_when(p <= corr001 ~ "p<.001",
                                                    p <= corr01 ~ "p<.01",
                                                    p <= corr05 ~ "p<.05",
                                                    TRUE ~ "n.s.")) %>%
  dplyr::mutate(p = round(p, 6)) %>%
  dplyr::mutate(x2 = round(x2, 1)) %>%
  dplyr::mutate(phi = round(phi, 2)) %>%
  dplyr::arrange(p) %>%
  # drop the intermediate columns used only for the correction
  dplyr::select(-a, -b, -c, -d, -j, -NRows, -corr05, -corr01, -corr001) %>%
  # "Type" = over-used, "Antitype" = under-used relative to expectation
  dplyr::mutate(Type = ifelse(expected > TermCoocFreq, "Antitype", "Type"))
The results show that poverty does not collocate significantly with any words.
–> (EXE) Changes in Collocation Strength
–> (EXE) Collostructional Analysis
III.vi) > > Sentiment Analysis
https://cfss.uchicago.edu/notes/harry-potter-exercise/
III.v) Topic modeling
- Robinson, Silge
- https://cfss.uchicago.edu/notes/topic-modeling/
- https://m-clark.github.io/text-analysis-with-R/topic-modeling.html
- AH: https://datavizf18.classes.andrewheiss.com/class/11-class/#topic-modeling
[not sure applicable, they are all same topic here!]
Topic modeling is a method for unsupervised classification of documents (blog post, news articles), similar to clustering on numeric data, which finds natural groups of items even when we’re not sure what we’re looking for.
Methods:
- Latent Dirichlet allocation (LDA) is a particularly popular method for fitting a topic model -> It treats each document as a mixture of topics, and each topic as a mixture of words. The basic idea is that we’ll take a whole lot of features and boil them down to a few ‘topics’. In this sense LDA is akin to discrete PCA.
— LDA (Latent Dirichlet allocation) with topicmodels package
NOTE: The `topicmodels` package takes a Document-Term Matrix as input and produces a model that can be tidied by tidytext, such that it can be manipulated and visualized with dplyr and ggplot2.
Principles:
- imagine that each document may contain words from several topics in particular proportions
- Every topic is a mixture of words
LDA is a mathematical method for estimating both of these at the same time: finding the mixture of words that is associated with each topic, while also determining the mixture of topics that describes each document.
— From abstracts tidytext 2 dtm with tidytext
# starting from the one-row-per-token tidy table
paint(abs_words2)
# cast into a Document-Term Matrix (*)
abs_words_dtm <- tidytext::cast_dtm(abs_words2, date_issued, word, n)
abs_words_dtm
# cast into a Term-Document Matrix
abs_words_tdm <- tidytext::cast_tdm(abs_words2, date_issued, word, n)
abs_words_tdm
# cast into quanteda's dfm Document-feature matrix
abs_words_dfm <- cast_dfm(abs_words2, date_issued, word, n)
# cast into a sparse Matrix object
abs_words_m <- cast_sparse(abs_words2, date_issued, word, n)
class(abs_words_m)
— …from dtm 2 LDA document structure
https://cfss.uchicago.edu/notes/topic-modeling/
# from tidytext format (one-row-per-token)
# ---- 1/2 cast into a Document-Term Matrix (*)
abs_words_dtm <- tidytext::cast_dtm(abs_words2, date_issued, word, n)
abs_words_dtm
# ---- 2/2 fit the LDA topic model on the Document-Term Matrix (*)
# a fixed seed makes the stochastic fit reproducible;
# k is the number of topics
abs_lda <- topicmodels::LDA(abs_words_dtm, k = 2, control = list(seed = 1234))
abs_lda
Fitting the model was the “easy part”: the rest of the analysis will involve exploring and interpreting the model using tidying functions from the tidytext package.
NOTE: What if k changes? Several different values of k may be plausible, but by increasing k we sacrifice clarity.
— Word-topic probabilities
The tidytext package uses broom::tidy for extracting the per-topic-per-word probabilities, called β (“beta”), from the model.
NOTE: For each combination, the model computes the probability of that term being generated from that topic.
# extract the per-topic-per-word probabilities (beta) from the fitted model
abs_topics <- tidytext::tidy(abs_lda, matrix = "beta")
# one-topic-per-term-per-row format
abs_topics
#> For example, the term “data” has a 8.33×10−12 probability of being generated from topic 1, but a 1.1×10−3 probability of being generated from topic 2.
We could use dplyr’s slice_max() to find the 10 terms that are most common within each topic. As a tidy data frame, this lends itself well to a ggplot2 visualization
# keep the 10 highest-beta terms within each topic
abs_top_terms <- abs_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, -beta)

# bar chart of top terms, ordered within each topic facet
abs_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()
In alternative, we could consider the terms that had the greatest difference in \(\beta\) between topic 1 and topic 2. This can be estimated based on the log ratio of the two: \(\log_2(\frac{\beta_2}{\beta_1})\) (a log ratio is useful because it makes the difference symmetrical: \(\beta_2\) being twice as large leads to a log ratio of 1, while \(\beta_1\) being twice as large results in -1). To constrain it to a set of especially relevant words, we can filter for relatively common words, such as those that have a \(\beta\) greater than 1/1000 in at least one topic.
The words with the greatest differences between the two topics are visualized in Figure @ref(fig:topiccompare).
(ref:topiccap) Words with the greatest difference in \(\beta\) between topic 2 and topic 1
— Document-topic probabilities
Besides estimating each topic as a mixture of words, LDA also models each document as a mixture of topics. We can examine the per-document-per-topic probabilities, called \(\gamma\) (“gamma”), with the matrix = "gamma" argument to tidy().
# extract the per-document-per-topic probabilities (gamma)
abs_documents <- tidy(abs_lda, matrix = "gamma")
# 44 WDR x 2 topics = 88 rows
abs_documents
Each of these values is an estimated proportion of words from that document that are generated from that topic. For example, the model estimates that only about percent(abs_documents$gamma[1]) of the words in document 1 were generated from topic 1.
We can see that many of these documents were drawn from a mix of the two topics, but that document 2014 was drawn almost entirely from topic 1, having a \(\gamma\) from topic 2 close to zero. To check this answer, we could tidy() the document-term matrix (see Chapter @ref(tidy-dtm)) and check what the most common words in that document were.
Based on the most common words, this appears to be an article about the relationship between the American government and Panamanian dictator Manuel Noriega, which means the algorithm was right to place it in topic 2 (as political/national news).
————– STOP ——————
— Alternative LDA implementations
III.vii) > > Supervised classification with text data
https://cfss.uchicago.edu/notes/supervised-text-classification/ we can now use machine learning models to classify text into specific sets of categories. This is known as supervised learning.
Reference Tutorials
Robinson and Silge (2022)
Benjamin Soltoff: Computing 4 Social Sciences - API
Benjamin Soltoff: Computing 4 Social Sciences - text analysis
Ben Schmidt: Humanities Data Analysis (book and course)
- ✔️ MEDIUM articles: common words, pairwise correlations - 2018-12-04
- ✔️ TidyTuesday Tweets - 2019-01-07
- Wine Ratings - 2019-05-31 Lasso regression | sentiment lexicon,
- Simpsons Guest Stars 2019-08-30 geom_histogram
- Horror Movies 2019-10-22 explaining glmnet package | Lasso regression
- The Office 2020-03-16 geom_text_repel from ggrepel | glmnet package to run a cross-validated LASSO regression
- Animal Crossing 2020-05-05 Using geom_line and geom_point to graph ratings over time | geom_text to visualize what words are associated with positive/negative reviews |topic modelling